Setup

Load packages and directories

Summary

Data Sources

GFW effort, bathymetry (GEBCO), DTS, DTP, SST, Chla

Methods

Fit models to test presence of fishing effort and run random forest to determine variable importance

Predict presence with best fit model

Create rules to spatialise effort into EEZ where industrial effort is mostly likely to be (by gear type and length?)

bathy <- rast(file.path(rdsi_dir, "prep/random_forest/bathymetry_0.1_res.tif"))

dts <- rast(file.path(rdsi_raw_dir, "global_fishing_watch/distance_from_shore/distance-from-shore_0.1.tif"))

dtp <- rast(file.path(rdsi_raw_dir, "global_fishing_watch/distance_from_port/distance-from-port-v1_0.1.tif"))

sst <- rast(file.path(rdsi_raw_dir, "noaa/sst/resample_0.1/sst_2020_0.1.tif"))

chla <- rast(file.path(rdsi_dir, "prep/random_forest/chla_2020_0.1.tif"))

effort_subset <- read.csv(file.path(rdsi_dir, "prep/random_forest/effort_2020_vut_arg_isl.csv")) %>%
  rename(x = cell_ll_lon, y = cell_ll_lat)

eez_rast <- rast(here("int/eez_raster.tif"))

eez_rast_df <- fread(here("int/eez_raster_df.csv")) 

rgn_keys <- read.csv(here("raw/ohi_region_key.csv"))

Start with Argentina model

Prep all rasters to Argentina’s EEZ and have a value for each explanatory variable in each cell

eez_arg_df <- eez_rast_df %>%
  filter(rgn_id == 172) %>%
  dplyr::select(x, y) %>%
  mutate(eez = 1)

eez_arg_rast <- rast(eez_arg_df, crs = crs(eez_rast), type = "xyz")


# depth
bathy_arg <- crop(bathy, eez_arg_rast) %>% # crop to correct region
  project(., eez_arg_rast) %>% # make sure extents match
  mask(., eez_arg_rast) # now mask out any areas not in EEZ

## distance to short
dts_arg <- crop(dts, eez_arg_rast) %>% # crop to correct region
  project(., eez_arg_rast) %>% # make sure extents match
  mask(., eez_arg_rast) # now mask out any areas not in EEZ

# distance to port
dtp_arg <- crop(dtp, eez_arg_rast) %>% # crop to correct region
  project(., eez_arg_rast) %>% # make sure extents match
  mask(., eez_arg_rast) # now mask out any areas not in EEZ

# sst
sst_arg <- crop(sst, eez_arg_rast) %>% # crop to correct region
  project(., eez_arg_rast) %>% # make sure extents match
  mask(., eez_arg_rast) # now mask out any areas not in EEZ

chl_arg <- crop(chla, eez_arg_rast) %>% # crop to correct region
  project(., eez_arg_rast) %>% # make sure extents match
  mask(., eez_arg_rast) %>% # now mask out any areas not in EEZ
  rename(chl = mean)


# effort hours
effort_arg <- effort_subset %>% 
  group_by(x, y) %>%
  summarise(fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  rast(., type = "xyz", crs = crs(eez_arg_rast)) %>%
  crop(., eez_arg_rast) %>%
  project(., eez_arg_rast) %>%
  tidyterra::mutate(fishing_hours = ifelse(is.na(fishing_hours), 0, fishing_hours)) %>%
  mask(., eez_arg_rast) # ok so as.data.frame removes NAs? 
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# now stack all rasters

all_rasts <- c(effort_arg, dtp_arg, dts_arg, bathy_arg, sst_arg, chl_arg) %>%
  as.data.frame(., xy = TRUE, na.rm = NA) %>%
  mutate(fis_pres = as.factor(ifelse(fishing_hours > 0, 1, 0))) %>%
  #dplyr::select(-fishing_hours) %>%
  filter(!is.na(bathymetry),
         !is.na(mean),
         !is.na(chl)) %>% # need to filter out NAs for RF to work 
  rename(dtp = 4, dts = 5, sst = mean) 

all_env_stack <- c(dtp_arg, dts_arg, bathy_arg, sst_arg, chl_arg) %>%
    rename(dtp = 1, dts = 2, sst = mean) 

Start with random forest model on presence of fishing effort

Make models with magnitude of hours

all_rasts <- all_rasts %>%
  filter(fishing_hours > 0)

# all_rasts_stack <- rast(all_rasts, type = "xyz")

## lets check the hours of effort against predictor variables
par(mfrow=c(3,2))
for(i in 4:8){
  plot(all_rasts[,1]~all_rasts[,i], ylab="Effort hours", xlab=names(all_rasts)[i], main=paste0("Effort vs ",names(all_rasts)[i]))
  l <- loess(all_rasts[,1]~all_rasts[,i])
  j <- order(all_rasts[,i]) ## values need to be ordered for the red line to run along and not zigzag
  lines(all_rasts[j,i],l$fitted[j], col="red")
}

## ok so depth is the most obvious polynomial. Potentially chl and sst. Let's just rock with depth to start

## Rounding will introduce only little error, so we'll round the values to be able to use poisson and negative binomial models, and compare each method with each other (e.g. poisson and gaussian model)
all_rasts$fishing_hours <- ceiling(all_rasts$fishing_hours)

##
effort.formula <- formula("fishing_hours ~ bathymetry+sst+chl+I(bathymetry^2)+dtp")
## we also setup the formula for the lognormal model, which uses the log-transformed response:
effort.formula.ln <- formula("log(fishing_hours) ~ bathymetry+sst+chl+I(bathymetry^2)+dtp")

## GLMs

## fitting the model and stepwise dropping variables using stepAIC, and the option "trace=FALSE" to not display each step
fit.n <- stepAIC(glm(effort.formula, data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
## check the summary and diagnostics
summary(fit.n)
## 
## Call:
## glm(formula = fishing_hours ~ sst + chl + I(bathymetry^2) + dtp, 
##     family = gaussian(link = "identity"), data = all_rasts)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -231.7   -77.8   -40.9     0.7  9392.0  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.500e+02  1.883e+01  -7.970 1.82e-15 ***
## sst              8.771e+00  1.340e+00   6.545 6.33e-11 ***
## chl              1.871e+01  2.605e+00   7.183 7.44e-13 ***
## I(bathymetry^2) -1.134e-05  2.608e-06  -4.348 1.39e-05 ***
## dtp              4.493e-01  3.251e-02  13.822  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 77060.82)
## 
##     Null deviance: 611387028  on 7651  degrees of freedom
## Residual deviance: 589284070  on 7647  degrees of freedom
## AIC: 107825
## 
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))

plot(fit.n)

## AIC is realllllly big. This is a horrible model. 

## lognormal
fit.ln <- stepAIC(glm("log(ceiling(fishing_hours)) ~ bathymetry+sst+chl+I(bathymetry^2)+dtp", data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
summary(fit.ln)
## 
## Call:
## glm(formula = log(ceiling(fishing_hours)) ~ bathymetry + sst + 
##     chl + dtp, family = gaussian(link = "identity"), data = all_rasts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1445  -1.3387  -0.0525   1.2512   7.0307  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.098e+00  1.172e-01  -9.372   <2e-16 ***
## bathymetry   8.584e-04  5.369e-05  15.987   <2e-16 ***
## sst          1.921e-01  8.326e-03  23.069   <2e-16 ***
## chl          1.471e-01  1.642e-02   8.959   <2e-16 ***
## dtp          6.248e-03  2.078e-04  30.064   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3.016351)
## 
##     Null deviance: 27626  on 7651  degrees of freedom
## Residual deviance: 23066  on 7647  degrees of freedom
## AIC: 30171
## 
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit.ln)

mu <- fitted(fit.ln)
sigma <- summary(fit.ln)$df.residual/nrow(all_rasts)*sqrt(summary(fit.ln)$dispersion)
-2*sum(log(dlnorm(all_rasts$fishing_hours,mu,sigma)))+2*(length(coef(fit.ln))+1)
## [1] 69875.41
## this one is a bit better, but still not great. AIC is lower and plots look not as skewed

## poisson

fit.p <- stepAIC(glm(effort.formula, data=all_rasts, family=poisson()), trace=FALSE)
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: fitted rates numerically 0 occurred
summary(fit.p)
## 
## Call:
## glm(formula = fishing_hours ~ bathymetry + sst + chl + I(bathymetry^2) + 
##     dtp, family = poisson(), data = all_rasts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -24.696   -9.766   -6.361   -1.321  255.025  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.537e-01  1.156e-02    13.3   <2e-16 ***
## bathymetry      -3.122e-03  2.322e-05  -134.5   <2e-16 ***
## sst              1.927e-01  7.303e-04   263.9   <2e-16 ***
## chl              1.340e-01  7.450e-04   179.9   <2e-16 ***
## I(bathymetry^2) -3.044e-06  2.261e-08  -134.6   <2e-16 ***
## dtp              5.862e-03  1.443e-05   406.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1898865  on 7651  degrees of freedom
## Residual deviance: 1596592  on 7646  degrees of freedom
## AIC: 1630853
## 
## Number of Fisher Scoring iterations: 9
par(mfrow=c(2,2))
plot(fit.p)

## yikes...  by far the worst

fit.rf <- randomForest(effort.formula, data=all_rasts, ntree=500)
fit.rf
## 
## Call:
##  randomForest(formula = effort.formula, data = all_rasts, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 52258.82
##                     % Var explained: 34.59
par(mfrow=c(2,1))
plot(fit.rf)
varImpPlot(fit.rf) # interesting, dtp is most important according to this. Looks like depth isn't so important anymore (this would probably change if split by gear type, species, etc)

## GAMs - add smooth spatial term

fit.gam <- gam(ceiling(fishing_hours) ~ bathymetry+sst+I(bathymetry^2)+chl+ dtp+ s(x,y), data=all_rasts, family=nb())
summary(fit.gam)
## 
## Family: Negative Binomial(0.527) 
## Link function: log 
## 
## Formula:
## ceiling(fishing_hours) ~ bathymetry + sst + I(bathymetry^2) + 
##     chl + dtp + s(x, y)
## 
## Parametric coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.170e+01  7.746e-01  15.104  < 2e-16 ***
## bathymetry      -2.830e-04  1.668e-04  -1.696   0.0898 .  
## sst             -7.503e-01  6.339e-02 -11.835  < 2e-16 ***
## I(bathymetry^2) -1.658e-07  4.190e-08  -3.958 7.56e-05 ***
## chl              4.046e-01  2.783e-02  14.539  < 2e-16 ***
## dtp              1.016e-03  9.146e-04   1.111   0.2664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df Chi.sq p-value    
## s(x,y) 28.54  28.98   2708  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0425   Deviance explained = 34.9%
## -REML =  35635  Scale est. = 1         n = 7652
par(mfrow=c(2,2))
plot(fit.gam)

## R2 not great... but not a huge deal. Not really sure how to interpret the plot... 


fit.gam2 <- gam(ceiling(fishing_hours) ~ s(bathymetry)+s(sst)+s(chl)+s(x, y), data=all_rasts, family=nb())
summary(fit.gam2)
## 
## Family: Negative Binomial(0.584) 
## Link function: log 
## 
## Formula:
## ceiling(fishing_hours) ~ s(bathymetry) + s(sst) + s(chl) + s(x, 
##     y)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.64534    0.01523   239.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df Chi.sq p-value    
## s(bathymetry)  8.743  8.980  321.2  <2e-16 ***
## s(sst)         8.845  8.990  807.8  <2e-16 ***
## s(chl)         8.758  8.982  569.4  <2e-16 ***
## s(x,y)        28.680 28.986 3936.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0803   Deviance explained = 42.2%
## -REML =  35156  Scale est. = 1         n = 7652
par(mfrow=c(2,2))

plot(fit.gam2)

## seems a bit better? Based on R2 and deviance explained... however everythign is significant, so maybe not

Make predictions

#extract both mean and se of predictions
predfun <- function(model, data) {
  v <- predict(model, data, type="response", se.fit=TRUE)
  cbind(p=as.vector(v$fit), se=as.vector(v$se.fit))
}

## for the lognormal:
predfun.ln <- function(model, data) {
  v <- predict(model, data, type="response", se.fit=TRUE)
  cbind(p=as.vector(exp(v$fit)), se=as.vector(exp(v$fit+v$se.fit)-exp(v$fit)))
}

#for the gams, we need to create a raster of lons and lats to predict on
ra.lon <- raster(all_env_stack$bathymetry)
ra.lon[] <- coordinates(stack(all_env_stack)$bathymetry)[,1]
ra.lat <- raster(all_env_stack$bathymetry)
ra.lat[] <- coordinates(stack(all_env_stack)$bathymetry)[,2]
ra.lon <- rast(ra.lon)
ra.lat <- rast(ra.lat)
all_env_stack$x <- ra.lon
all_env_stack$y <- ra.lat


#generate prediction maps
pred.n <- terra::predict(all_env_stack, fit.n, fun=predfun, index=1:2)
pred.ln <- predict(all_env_stack, fit.ln, fun=predfun.ln, index=1:2)
pred.p <-    predict(all_env_stack, fit.p, fun=predfun, index=1:2)
# pred.nb <-   predict(all_rasts_stack, fit.nb, fun=predfun, index=1:2)
pred.gam <-  predict(all_env_stack, fit.gam, fun=predfun, index=1:2)
pred.gam2 <-  predict(all_env_stack, fit.gam2, fun=predfun, index=1:2)
names(pred.ln) <- names(pred.p) <- names(pred.gam) <- names(pred.gam2) <- c("mean prediction", "std error")

## brt prediction is a bit different here
all_env_stack$bathy2 <- all_env_stack$bathymetry^2

## rf prediction
pred.rf <-   predict(all_env_stack, fit.rf, type="response")

#some plotting parameters:
pt.cex <- c(sqrt(10/pi)/2,sqrt(50/pi)/2,sqrt(100/pi)/2)
cex <- sqrt(all_rasts$fishing_hours/pi)/2
l <- c(10,50,100)

Now plot the predictions!

fish_eff_rast <- all_rasts %>% 
  dplyr::select(x, y, fishing_hours) %>%
  rast(., type = "xyz")

## normal
## Plot spatial predictions and overlay observed abundance  
par(mfrow=c(1,3))
## mean
plot(pred.n,1, main="normal - mean")
## standard error
plot(pred.n,2, main="normal - se")
plot(fish_eff_rast, main = "Observed hours")

## lognormal
par(mfrow=c(1,3))
## mean
plot(pred.ln,1, main="lognormal - mean")
## standard error
plot(pred.ln,2, main="lognormal - se")
plot(fish_eff_rast, main = "Observed hours")

## poisson
par(mfrow=c(1,3))
## mean
plot(pred.p,1, main="poisson - mean")
## standard error
plot(pred.p,2, main="poisson - se")
plot(fish_eff_rast, main = "Observed hours")

## gam
par(mfrow=c(1,3))
## mean
plot(pred.gam,1, main="GAM - mean")
## standard error
plot(pred.gam,2, main="GAM - se")
plot(fish_eff_rast, main = "Observed hours")

## gam with smoothing
par(mfrow=c(1,3))
## mean
plot(pred.gam2,1, main="GAM (smoothed) - mean")
## standard error
plot(pred.gam2,2, main="GAM (smoothed) - se")
plot(fish_eff_rast, main = "Observed hours")

# i think this looks the best 


## random forest
par(mfrow=c(1,2))
## mean
plot(pred.rf,1, main="Random Forest - mean")
plot(fish_eff_rast, main = "Observed hours") 

## ok this is the best by far

## is the RF overfit? 
# Is fishing hugging the EEZ? 

Do same analysis but with Iceland since they have the best coverage

eez_isl_df <- eez_rast_df %>%
  filter(rgn_id == 143) %>%
  dplyr::select(x, y) %>%
  mutate(eez = 1)

eez_isl_rast <- rast(eez_isl_df, crs = crs(eez_rast), type = "xyz")


# depth
bathy_isl <- crop(bathy, eez_isl_rast) %>% # crop to correct region
  project(., eez_isl_rast) %>% # make sure extents match
  mask(., eez_isl_rast) # now mask out any areas not in EEZ

## distance to short
dts_isl <- crop(dts, eez_isl_rast) %>% # crop to correct region
  project(., eez_isl_rast) %>% # make sure extents match
  mask(., eez_isl_rast) # now mask out any areas not in EEZ

# distance to port
dtp_isl <- crop(dtp, eez_isl_rast) %>% # crop to correct region
  project(., eez_isl_rast) %>% # make sure extents match
  mask(., eez_isl_rast) # now mask out any areas not in EEZ

# sst
sst_isl <- crop(sst, eez_isl_rast) %>% # crop to correct region
  project(., eez_isl_rast) %>% # make sure extents match
  mask(., eez_isl_rast) # now mask out any areas not in EEZ

chl_isl <- crop(chla, eez_isl_rast) %>% # crop to correct region
  project(., eez_isl_rast) %>% # make sure extents match
  mask(., eez_isl_rast) %>% # now mask out any areas not in EEZ
  rename(chl = mean)


# effort hours
effort_isl <- effort_subset %>% 
  group_by(x, y) %>%
  summarise(fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  rast(., type = "xyz", crs = crs(eez_isl_rast)) %>%
  crop(., eez_isl_rast) %>%
  project(., eez_isl_rast) %>%
  tidyterra::mutate(fishing_hours = ifelse(is.na(fishing_hours), 0, fishing_hours)) %>%
  mask(., eez_isl_rast) # ok so as.data.frame removes NAs? 
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# now stack all rasters

all_rasts <- c(effort_isl, dtp_isl, dts_isl, bathy_isl, sst_isl, chl_isl) %>%
  as.data.frame(., xy = TRUE, na.rm = NA) %>%
  mutate(fis_pres = as.factor(ifelse(fishing_hours > 0, 1, 0))) %>%
  #dplyr::select(-fishing_hours) %>%
  filter(!is.na(bathymetry),
         !is.na(mean),
         !is.na(chl)) %>% # need to filter out NAs for RF to work 
  rename(dtp = 4, dts = 5, sst = mean) 

all_env_stack <- c(dtp_isl, dts_isl, bathy_isl, sst_isl, chl_isl) %>%
    rename(dtp = 1, dts = 2, sst = mean) 
## first lets look at correlations to see if we should remove any variables
chart.Correlation(all_rasts[,4:8])
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

## dts and dtp are really highly correlated.. makes a lot of sense - will want to remove one of those. My guess is distance to port is more important for Argentina, considering they have a lot of industrial fishing
all_rasts_2 <- all_rasts %>% dplyr::select(-dts)

chart.Correlation(all_rasts_2[,4:7]) # ok, now depth and dtp are correlated.. For this simple analysis, I'll keep depth
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

chart.Correlation(all_rasts_2[,5:7]) # look ok now
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

Make models with magnitude of hours

all_rasts <- all_rasts %>%
  filter(fishing_hours > 0)

## lets check the hours of effort against predictor variables
par(mfrow=c(3,2))
for(i in 4:8){
  plot(all_rasts[,1]~all_rasts[,i], ylab="Effort hours", xlab=names(all_rasts)[i], main=paste0("Effort vs ",names(all_rasts)[i]))
  l <- loess(all_rasts[,1]~all_rasts[,i])
  j <- order(all_rasts[,i]) ## values need to be ordered for the red line to run along and not zigzag
  lines(all_rasts[j,i],l$fitted[j], col="red")
}

## ok so sst is the most obvious polynomial.

## Rounding will introduce only little error, so we'll round the values to be able to use poisson and negative binomial models, and compare each method with each other (e.g. poisson and gaussian model)
all_rasts$fishing_hours <- ceiling(all_rasts$fishing_hours)

##
effort.formula <- formula("fishing_hours ~ bathymetry+sst+chl+I(sst^2)")
## we also setup the formula for the lognormal model, which uses the log-transformed response:
effort.formula.ln <- formula("log(fishing_hours) ~ bathymetry+sst+chl+I(sst^2)")

## GLMs

## fitting the model and stepwise dropping variables using stepAIC, and the option "trace=FALSE" to not display each step
fit.n <- stepAIC(glm(effort.formula, data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
## check the summary and diagnostics
summary(fit.n)
## 
## Call:
## glm(formula = fishing_hours ~ bathymetry + sst + chl + I(sst^2), 
##     family = gaussian(link = "identity"), data = all_rasts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -653.55   -63.20   -28.59    24.62  2024.70  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -86.845122  15.648895  -5.550 2.98e-08 ***
## bathymetry    0.053140   0.003492  15.217  < 2e-16 ***
## sst          39.770699   4.794373   8.295  < 2e-16 ***
## chl          90.399407   4.561383  19.818  < 2e-16 ***
## I(sst^2)     -3.155503   0.386519  -8.164 3.90e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18508.85)
## 
##     Null deviance: 138571608  on 6134  degrees of freedom
## Residual deviance: 113459244  on 6130  degrees of freedom
## AIC: 77700
## 
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))

plot(fit.n)

## AIC is realllllly big. This is a horrible model. 

## lognormal
fit.ln <- stepAIC(glm("log(ceiling(fishing_hours)) ~ bathymetry+sst+chl+I(sst^2)", data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
summary(fit.ln)
## 
## Call:
## glm(formula = log(ceiling(fishing_hours)) ~ bathymetry + sst + 
##     chl + I(sst^2), family = gaussian(link = "identity"), data = all_rasts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.4044  -0.9915   0.1312   1.0554   4.3309  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.916996   0.165822    5.53 3.33e-08 ***
## bathymetry   0.001755   0.000037   47.44  < 2e-16 ***
## sst          0.767881   0.050803   15.12  < 2e-16 ***
## chl          0.912231   0.048334   18.87  < 2e-16 ***
## I(sst^2)    -0.061727   0.004096  -15.07  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.078247)
## 
##     Null deviance: 24102  on 6134  degrees of freedom
## Residual deviance: 12740  on 6130  degrees of freedom
## AIC: 21905
## 
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit.ln)

mu <- fitted(fit.ln)
sigma <- summary(fit.ln)$df.residual/nrow(all_rasts)*sqrt(summary(fit.ln)$dispersion)
-2*sum(log(dlnorm(all_rasts$fishing_hours,mu,sigma)))+2*(length(coef(fit.ln))+1)
## [1] 57602.44
## this one is a bit better, but still not great. AIC is lower and plots look not as skewed

## poisson

fit.p <- stepAIC(glm(effort.formula, data=all_rasts, family=poisson()), trace=FALSE)
summary(fit.p)
## 
## Call:
## glm(formula = fishing_hours ~ bathymetry + sst + chl + I(sst^2), 
##     family = poisson(), data = all_rasts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -38.611   -7.638   -2.844    1.577   99.817  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.008e+00  2.612e-02  153.47   <2e-16 ***
## bathymetry   2.855e-03  9.296e-06  307.14   <2e-16 ***
## sst          3.539e-01  7.773e-03   45.52   <2e-16 ***
## chl          2.316e-01  2.490e-03   93.01   <2e-16 ***
## I(sst^2)    -2.689e-02  5.931e-04  -45.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 988872  on 6134  degrees of freedom
## Residual deviance: 596793  on 6130  degrees of freedom
## AIC: 626177
## 
## Number of Fisher Scoring iterations: 6
par(mfrow=c(2,2))
plot(fit.p)

## yikes...  still bad


## Negative binomial

fit.nb <- stepAIC(glm.nb(effort.formula, data=all_rasts), trace=FALSE)
summary(fit.nb)
## 
## Call:
## glm.nb(formula = fishing_hours ~ bathymetry + sst + chl + I(sst^2), 
##     data = all_rasts, init.theta = 0.7168794565, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0915  -1.1361  -0.5101   0.1812   6.2808  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.194e+00  1.433e-01   15.31   <2e-16 ***
## bathymetry   1.936e-03  3.477e-05   55.68   <2e-16 ***
## sst          6.615e-01  4.384e-02   15.09   <2e-16 ***
## chl          8.145e-01  4.007e-02   20.33   <2e-16 ***
## I(sst^2)    -5.183e-02  3.546e-03  -14.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.7169) family taken to be 1)
## 
##     Null deviance: 12493.1  on 6134  degrees of freedom
## Residual deviance:  7054.1  on 6130  degrees of freedom
## AIC: 58406
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.7169 
##           Std. Err.:  0.0118 
## 
##  2 x log-likelihood:  -58393.6060
par(mfrow=c(2,2))
plot(fit.nb)

# overfit? 

fit.rf <- randomForest(effort.formula, data=all_rasts, ntree=500)
fit.rf
## 
## Call:
##  randomForest(formula = effort.formula, data = all_rasts, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 12503.08
##                     % Var explained: 44.64
par(mfrow=c(2,1))
plot(fit.rf) # depth by far most important

## GAMs - add smooth spatial term

fit.gam <- gam(ceiling(fishing_hours) ~ bathymetry+sst+I(sst^2)+chl+ s(x,y), data=all_rasts, family=nb())
summary(fit.gam)
## 
## Family: Negative Binomial(0.918) 
## Link function: log 
## 
## Formula:
## ceiling(fishing_hours) ~ bathymetry + sst + I(sst^2) + chl + 
##     s(x, y)
## 
## Parametric coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.672e+00  2.345e-01   7.129 1.01e-12 ***
## bathymetry   2.165e-03  7.193e-05  30.093  < 2e-16 ***
## sst          1.475e+00  7.047e-02  20.923  < 2e-16 ***
## I(sst^2)    -1.396e-01  7.405e-03 -18.857  < 2e-16 ***
## chl         -1.617e-02  5.209e-02  -0.310    0.756    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df Chi.sq p-value    
## s(x,y) 28.29  28.97   2513  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.228   Deviance explained = 57.2%
## -REML =  28360  Scale est. = 1         n = 6135
par(mfrow=c(2,2))

plot(fit.gam)

## R2 not great... but not a huge deal. Not really sure how to interpret the plot... 


fit.gam2 <- gam(ceiling(fishing_hours) ~ s(bathymetry)+s(sst)+s(chl)+s(x, y), data=all_rasts, family=nb())
summary(fit.gam2)
## 
## Family: Negative Binomial(0.981) 
## Link function: log 
## 
## Formula:
## ceiling(fishing_hours) ~ s(bathymetry) + s(sst) + s(chl) + s(x, 
##     y)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.5069     0.0136   257.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df  Chi.sq p-value    
## s(bathymetry)  8.231  8.842  977.41  <2e-16 ***
## s(sst)         7.412  8.393  457.18  <2e-16 ***
## s(chl)         7.136  8.204   76.08  <2e-16 ***
## s(x,y)        28.294 28.942 1981.12  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.232   Deviance explained = 60.8%
## -REML =  28108  Scale est. = 1         n = 6135
par(mfrow=c(2,2))

plot(fit.gam2)

## seems a bit better? Based on R2 and deviance explained... however everythign is significant, so maybe not

Make predictions

#extract both mean and se of predictions
predfun <- function(model, data) {
  v <- predict(model, data, type="response", se.fit=TRUE)
  cbind(p=as.vector(v$fit), se=as.vector(v$se.fit))
}

## for the lognormal:
predfun.ln <- function(model, data) {
  v <- predict(model, data, type="response", se.fit=TRUE)
  cbind(p=as.vector(exp(v$fit)), se=as.vector(exp(v$fit+v$se.fit)-exp(v$fit)))
}

#for the gams, we need to create a raster of lons and lats to predict on
ra.lon <- raster(all_env_stack$bathymetry)
ra.lon[] <- coordinates(stack(all_env_stack)$bathymetry)[,1]
ra.lat <- raster(all_env_stack$bathymetry)
ra.lat[] <- coordinates(stack(all_env_stack)$bathymetry)[,2]
ra.lon <- rast(ra.lon)
ra.lat <- rast(ra.lat)
all_env_stack$x <- ra.lon
all_env_stack$y <- ra.lat


#generate prediction maps
pred.n <- terra::predict(all_env_stack, fit.n, fun=predfun, index=1:2)
pred.ln <- predict(all_env_stack, fit.ln, fun=predfun.ln, index=1:2)
pred.p <-    predict(all_env_stack, fit.p, fun=predfun, index=1:2)
pred.nb <-   predict(all_env_stack, fit.nb, fun=predfun, index=1:2)
pred.gam <-  predict(all_env_stack, fit.gam, fun=predfun, index=1:2)
pred.gam2 <-  predict(all_env_stack, fit.gam2, fun=predfun, index=1:2)
names(pred.ln) <- names(pred.p) <- names(pred.gam) <- names(pred.gam2) <- c("mean prediction", "std error")

## brt prediction is a bit different here
all_env_stack$bathy2 <- all_env_stack$bathymetry^2

## rf prediction
pred.rf <-   predict(all_env_stack, fit.rf, type="response")

#some plotting parameters:
pt.cex <- c(sqrt(10/pi)/2,sqrt(50/pi)/2,sqrt(100/pi)/2)
cex <- sqrt(all_rasts$fishing_hours/pi)/2
l <- c(10,50,100)

Now plot the predictions!

fish_eff_rast <- all_rasts %>% 
  dplyr::select(x, y, fishing_hours) %>%
  rast(., type = "xyz")

## normal
## Plot spatial predictions and overlay observed abundance  
par(mfrow=c(1,3))
## mean
plot(pred.n,1, main="normal - mean")
## standard error
plot(pred.n,2, main="normal - se")
plot(fish_eff_rast, main = "Observed hours")

## lognormal
par(mfrow=c(1,3))
## mean
plot(pred.ln,1, main="lognormal - mean")
## standard error
plot(pred.ln,2, main="lognormal - se")
plot(fish_eff_rast, main = "Observed hours")

# yikes

## poisson
par(mfrow=c(1,3))
## mean
plot(pred.p,1, main="poisson - mean")
## standard error
plot(pred.p,2, main="poisson - se")
plot(fish_eff_rast, main = "Observed hours")

## gam
par(mfrow=c(1,3))
## mean
plot(pred.gam,1, main="GAM - mean")
## standard error
plot(pred.gam,2, main="GAM - se")
plot(fish_eff_rast, main = "Observed hours")

## gam with smoothing
par(mfrow=c(1,3))
## mean
plot(pred.gam2,1, main="GAM (smoothed) - mean")
## standard error
plot(pred.gam2,2, main="GAM (smoothed) - se")
plot(fish_eff_rast, main = "Observed hours")

# i think this looks the best 


## random forest
par(mfrow=c(1,2))
## mean
plot(pred.rf,1, main="Random Forest - mean predictions")
plot(fish_eff_rast, main = "Observed hours") 

## ok this is the best by far but is it overfit?